## vector with package names
x <- c("pbapply", "viridis", "knitr", "kableExtra", "ggplot2", "ape", "rwty")

aa <- lapply(x, function(y) {
  
  # check if installed, if not then install 
  if (!y %in% installed.packages()[,"Package"]) 
    install.packages(y) 

  # load package
  try(require(y, character.only = T), silent = T)
})
knitr::opts_knit$set(root.dir = normalizePath(".."))

knitr::opts_chunk$set(dpi = 38, fig.width = 18, fig.height = 10, echo = TRUE) 

options(knitr.kable.NA = '')

theme_set(theme_light(base_size = 30))
source('./source/custom_treespace.R')

# read data to order leks by number of years sampled
fs <- read.csv("./data/processed/fossil_series.csv", header = T, sep = ",")
years_by_lek <- aggregate(Sampled ~ Lek, data = fs, FUN = sum)
years_by_lek <- years_by_lek[order(years_by_lek$Sampled, decreasing = TRUE), ]
rb.trees <- readRDS("./output/revbayes_output_in_single_R_object.RDS")

rb.trees <- rb.trees[grep("run_", names(rb.trees), invert = TRUE, value = TRUE)]

trees_diags <- data.frame(model = names(rb.trees))

# get lek name
trees_diags$lek <- sapply(strsplit(trees_diags$model, "_", fixed = TRUE), "[", 1)

# substitution model
trees_diags$subs <- sapply(strsplit(trees_diags$model, "_", fixed = TRUE), "[", 2)

# period
trees_diags$period <- sapply(strsplit(trees_diags$model, "_", fixed = TRUE), "[", 3)

# and which fossils were used
trees_diags$fossils <- sapply(strsplit(trees_diags$model, "_", fixed = TRUE), "[", 4)

trees_diags$align <- sapply(strsplit(trees_diags$model, "_", fixed = TRUE), "[", 5)

trees_diags$n_trees <- sapply(rb.trees, function(x) length(x$trees))

# Number of models by iterations and lek
kbl <- knitr::kable(as.matrix(table(trees_diags$n_trees, trees_diags$lek)), caption = "Number of models by lek and number of trees")

kableExtra::kable_styling(kbl)
Number of models by lek and number of trees
BR1 CCE HC1 SUR TR1
3002 18 0 24 6 18
6002 0 24 0 24 0

Tree spaces by leks

All fossils using pairwise shared tips

  • 100 trees for each model evenly spaced along the chain
  • Some trees were removed if NAs were produced when comparing topologies (i.e. non-comparable topologies)
#selected leks
sel_leks <- c("SUR", "CCE", "HC1", "BR1", "TR1")

tree_names <- grep("early", names(rb.trees), invert = TRUE, value = TRUE)

pnts_lks <- lapply(sel_leks, function(i) {
  
  print(i)
  lek_trees <- grep(i, tree_names, value = TRUE)
  
  multi_list <- lapply(lek_trees, function(x) read.tree(file.path("./output/most_recent_revbayes_models/", x), keep.multi = TRUE))
  
  names(multi_list) <- sapply(basename(lek_trees), function(x) paste(strsplit(x, "_", fixed = TRUE)[[1]][1:5], collapse = "_"))
  
  pnts <- try(custom_treespace(multi_list, n.points = 100, method = "RF", cl = 10), silent = TRUE)

  if (!is(pnts, "try-error")) 
    return(pnts) else
      return(NULL)
})

names(pnts_lks) <- sel_leks

sapply(pnts_lks, is.null)

saveRDS(pnts_lks, file = "./output/topology/tree_distance_all_fossils_pairwise_shared.RDS")

New and old data sets

pnts_lks <- readRDS("./output/topology/tree_distance_all_fossils_pairwise_shared.RDS")

# fix model names
pnts_lks <- lapply(pnts_lks, function(x) {
  
  # rename alignment models
  x$chain <- gsub("all.equal", "MAFFT-agnostic", x$chain) 
  x$chain <- gsub("optimal", "MAFFT-optimal", x$chain) 
  x$chain <- gsub("prank", "PRANK-TN93", x$chain) 
  x$chain <- gsub("new", "recent records", x$chain) 
  x$chain <- gsub("old", "historical records", x$chain)
  x$chain <- gsub("Uexp", "relaxed", x$chain)
  x$chain <- gsub("_all", "", x$chain)
  x$chain <- gsub("_", "/", x$chain)
  
  return(x)
})

ggs <- lapply(pnts_lks, function(X) {

    ggplot(data = X, aes(x = x, y = y, fill = generation)) +
    geom_path(alpha = 0.25, aes(colour = generation), size = 0.75, show.legend = FALSE) +
    scale_colour_gradient(low = "red", high = "yellow") +
    geom_point(shape = 21, size = 4, colour = "white", alpha = 0.6) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_fill_gradientn(colours = viridis(256)) +
      labs(x = "MDS_v1", y = "MDS_v2") + 
      ggtitle(label = sprintf("Tree space for %d trees", nrow(X) / length(unique(X$chain)))) 
    })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1

## 
## $TR1

Old data set

ggs <- lapply(pnts_lks, function(X) {
     
  X <- X[grep("historical records", X$chain), ]   
  
  if (nrow(X) > 0)
  ggplot(data = X, aes(x = x, y = y, fill = generation)) +
    geom_path(alpha = 0.25, aes(colour = generation), size = 0.75, show.legend = FALSE) +
    scale_colour_gradient(low = "red", high = "yellow") +
    geom_point(shape = 21, size = 4, colour = "white", alpha = 0.6) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_fill_gradientn(colours = viridis(256)) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %1.0f trees", nrow(X) / length(unique(X$chain))))
    })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1
## NULL
## 
## $TR1
## NULL

New data set

ggs <- lapply(pnts_lks, function(X) {
     
  X <- X[grep("recent records", X$chain), ]   
  
  ggplot(data = X, aes(x = x, y = y, fill = generation)) +
    geom_path(alpha = 0.25, aes(colour = generation), size = 0.75, show.legend = FALSE) +
    scale_colour_gradient(low = "red", high = "yellow") +
    geom_point(shape = 21, size = 4, colour = "white", alpha = 0.6) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_fill_gradientn(colours = viridis(256)) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %d trees", nrow(X) / length(unique(X$chain))))

    })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1

## 
## $TR1

Clumping molecular clocks

Old data set

ggs <- lapply(pnts_lks, function(X) {
     
  X <- X[grep("historical records", X$chain), ]   
  X$clock <- ifelse(grepl("global", X$chain), "global", "relaxed")
  X$chain <- gsub("/global|/relaxed", "", X$chain)
  
    if (nrow(X) > 0)
    ggplot(data = X, aes(x = x, y = y)) +
    geom_path(alpha = 0.1, size = 0.75, show.legend = FALSE) +
    geom_point(aes(shape = clock,  col = clock), size = 4) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_color_viridis_d(begin = 0.2, end = 0.8, alpha = 0.75) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %d trees", nrow(X) / length(unique(X$chain))))

    })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1
## NULL
## 
## $TR1
## NULL

New data set

ggs <- lapply(pnts_lks, function(X) {
     
  X <- X[grep("recent records", X$chain), ]   
   X$clock <- ifelse(grepl("global", X$chain), "global", "relaxed")
  X$chain <- gsub("/global|/relaxed", "", X$chain)
  
  ggplot(data = X, aes(x = x, y = y)) +
    geom_path(alpha = 0.1, size = 0.75, show.legend = FALSE) +
    geom_point(aes(shape = clock,  col = clock), size = 4) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_color_viridis_d(begin = 0.2, end = 0.8, alpha = 0.75) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %d trees", nrow(X) / length(unique(X$chain))))

    })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1

## 
## $TR1

Early fossils using only pairwise shared tips

#selected leks
sel_leks <- c("SUR", "CCE", "HC1", "BR1", "TR1")
# sel_leks <- c("SUR", "TR1")

tree_names <- grep("early", names(rb.trees), value = TRUE)

pnts_lks <- lapply(sel_leks, function(i) {
  
  # print(i)
  lek_trees <- grep(i, tree_names, value = TRUE)
  
  multi_list <- lapply(lek_trees, function(x) read.tree(file.path("./output/most_recent_revbayes_models/", x), keep.multi = TRUE))
  
  names(multi_list) <- sapply(basename(lek_trees), function(x) paste(strsplit(x, "_", fixed = TRUE)[[1]][1:5], collapse = "_"))
  
  pnts <- try(custom_treespace(multi_list, n.points = 100, method = "RF", cl = 10), silent = TRUE)

  if (!is(pnts, "try-error")) 
    return(pnts) else
      return(NULL)
})

names(pnts_lks) <- sel_leks

sapply(pnts_lks, is.null)

saveRDS(pnts_lks, file = "./output/topology/tree_distance_early_fossils_pairwise_shared.RDS")

Recent and historical data sets

pnts_lks <- readRDS("./output/topology/tree_distance_early_fossils_pairwise_shared.RDS")

# fix model names
pnts_lks <- lapply(pnts_lks, function(x) {
  
  # rename alignment models
  x$chain <- gsub("all.equal", "MAFFT-agnostic", x$chain) 
  x$chain <- gsub("optimal", "MAFFT-optimal", x$chain) 
  x$chain <- gsub("prank", "PRANK-TN93", x$chain) 
  x$chain <- gsub("new", "recent records", x$chain) 
  x$chain <- gsub("old", "historical records", x$chain)
  x$chain <- gsub("Uexp", "relaxed", x$chain)
  x$chain <- gsub("_all", "", x$chain)
  # x$chain <- gsub("_", "/", x$chain)
  
  return(x)
})

ggs <- lapply(pnts_lks, function(X) {

    ggplot(data = X, aes(x = x, y = y, fill = generation)) +
    geom_path(alpha = 0.25, aes(colour = generation), size = 0.75, show.legend = FALSE) +
    scale_colour_gradient(low = "red", high = "yellow") +
    geom_point(shape = 21, size = 4, colour = "white", alpha = 0.6) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_fill_gradientn(colours = viridis(256)) +
      labs(x = "MDS_v1", y = "MDS_v2") + 
    ggtitle(label = sprintf("Tree space for %1.0f trees", nrow(X) / length(unique(X$chain)))) 

    })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1

## 
## $TR1

New data set

ggs <- lapply(pnts_lks, function(X) {
     
  X <- X[grep("recent records", X$chain), ]   
  
  ggplot(data = X, aes(x = x, y = y, fill = generation)) +
    geom_path(alpha = 0.25, aes(colour = generation), size = 0.75, show.legend = FALSE) +
    scale_colour_gradient(low = "red", high = "yellow") +
    geom_point(shape = 21, size = 4, colour = "white", alpha = 0.6) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_fill_gradientn(colours = viridis(256)) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %1.0f trees", nrow(X) / length(unique(X$chain))))
  })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1

## 
## $TR1

Old data set

ggs <- lapply(pnts_lks, function(X) {
     
  X <- X[grep("historical records", X$chain), ]   
  
  if (nrow(X) > 0)
  ggplot(data = X, aes(x = x, y = y, fill = generation)) +
    geom_path(alpha = 0.25, aes(colour = generation), size = 0.75, show.legend = FALSE) +
    scale_colour_gradient(low = "red", high = "yellow") +
    geom_point(shape = 21, size = 4, colour = "white", alpha = 0.6) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_fill_gradientn(colours = viridis(256)) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %1.0f trees", nrow(X) / length(unique(X$chain))))
  })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1
## NULL
## 
## $TR1
## NULL

Clumping molecular clocks

Old data set

ggs <- lapply(pnts_lks, function(X) {
     
  X <- X[grep("historical records", X$chain), ]   
  X$clock <- ifelse(grepl("global", X$chain), "Global", "relaxex")
  X$chain <- gsub("/global|/relaxed|/Uexp", "", X$chain)
  
    if (nrow(X) > 0)
    ggplot(data = X, aes(x = x, y = y)) +
    geom_path(alpha = 0.1, size = 0.75, show.legend = FALSE) +
    geom_point(aes(shape = clock,  col = clock), size = 4) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_color_viridis_d(begin = 0.2, end = 0.8, alpha = 0.75) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %1.0f trees", nrow(X) / length(unique(X$chain))))

    })

ggs
## $SUR

## 
## $CCE

## 
## $HC1

## 
## $BR1
## NULL
## 
## $TR1
## NULL

New data set

ggs <- lapply(pnts_lks, function(X) {
  
  X <- X[grep("recent records", X$chain), ]   
  if (X)
  X$clock <- ifelse(grepl("global", X$chain), "Global", "relaxed")
  X$chain <- gsub("/global|/relaxed|/Uexp", "", X$chain)
  
  ggplot(data = X, aes(x = x, y = y)) +
    geom_path(alpha = 0.1, size = 0.75, show.legend = FALSE) +
    geom_point(aes(shape = clock,  col = clock), size = 4) +
      theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
      panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
    facet_wrap(~chain, nrow = 4) +
      scale_color_viridis_d(begin = 0.2, end = 0.8, alpha = 0.75) +
      labs(x = "MDS_v1", y = "MDS_v2") +
    ggtitle(label = sprintf("Tree space for %1.0f trees", nrow(X) / length(unique(X$chain))))

    })

ggs

All data available (all fossils, old and new periods), relaxed molecular clock, pairwise shared tips

#selected leks
sel_leks <- c("SUR", "CCE", "HC1", "BR1", "TR1")

# relax molecular clock
tree_names <- grep("global", names(rb.trees), value = TRUE)

# all data for leks with historical data
# una figura por lek, comparando los tres alineamientos y usando el reloj molecular relajado y "all" fossils (o early si es que hay un problema sacando árboles con puntas compartidas para "all"). Para los leks BR1 y TR1 esto sería obviamente sólo con los fósiles recientes. Para SUR, CCE y HC1 mejor incluyendo a los old gapped.

tree_names <- tree_names[grepl("BR1|TR1", tree_names) & grepl("_all_", tree_names) | grepl("SUR|CCE|HC1", tree_names) & !grepl("early|new|no.fossils", tree_names)]


pnts_lks <- lapply(sel_leks, function(i) {
  
  print(i)
  lek_trees <- grep(i, tree_names, value = TRUE)
  
  multi_list <- lapply(lek_trees, function(x) read.tree(file.path("./output/most_recent_revbayes_models/", x), keep.multi = TRUE))
  
  names(multi_list) <- sapply(basename(lek_trees), function(x) paste(strsplit(x, "_", fixed = TRUE)[[1]][1:5], collapse = "_"))
  
  pnts <- try(custom_treespace(multi_list, n.points = 100, method = "RF", cl = 10), silent = TRUE)

  if (!is(pnts, "try-error")) 
    return(pnts) else
      return(NULL)
})

names(pnts_lks) <- sel_leks

sapply(pnts_lks, is.null)


# put all in a single data frame
pnts_lks_df <- do.call(rbind, pnts_lks)

saveRDS(pnts_lks_df, file = "./output/topology/tree_distance_all_data_by_lek_and_alignment.RDS")
# read data
pnts_lks_df <- readRDS("./output/topology/tree_distance_all_data_by_lek_and_alignment.RDS")

# rename alignment models
pnts_lks_df$chain[grep("all.equal", pnts_lks_df$chain)] <- "MAFFT-agnostic"
pnts_lks_df$chain[grep("optimal", pnts_lks_df$chain)] <- "MAFFT-optimal"
pnts_lks_df$chain[grep("prank", pnts_lks_df$chain)] <- "PRANK-TN93"


ggplot(data = pnts_lks_df, aes(x = x, y = y, fill = generation)) +
  geom_path(alpha = 0.25, aes(colour = generation), size = 0.75, show.legend = FALSE) +
  scale_colour_gradient(low = "red", high = "yellow") +
  geom_point(shape = 21, size = 4, colour = "white", alpha = 0.6) +
    theme(panel.background = element_blank(), axis.line = element_line(color = "grey"),
    panel.spacing = unit(0.1, "lines"), axis.title.x = element_text(vjust = -0.5), axis.title.y = element_text(vjust = 1.5)) +
  facet_grid(scales = "free", cols = vars(chain), rows = vars(lek)) +
    scale_fill_gradientn(colours = viridis(256)) +
    labs(x = "MDS_v1", y = "MDS_v2") + 
    ggtitle(label = sprintf("Tree space for %d trees", nrow(pnts_lks_df) / length(unique(pnts_lks_df$chain)))) + 
    theme_light(base_size = 30)

R session information

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=es_CR.UTF-8        LC_COLLATE=pt_BR.UTF-8    
##  [5] LC_MONETARY=es_CR.UTF-8    LC_MESSAGES=pt_BR.UTF-8   
##  [7] LC_PAPER=es_CR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] rwty_1.0.2        ape_5.5           ggplot2_3.3.3     kableExtra_1.3.4 
## [5] knitr_1.33        viridis_0.6.1     viridisLite_0.4.0 pbapply_1.4-3    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6         svglite_2.0.0      lattice_0.20-41    digest_0.6.27     
##  [5] utf8_1.2.1         R6_2.5.0           plyr_1.8.6         evaluate_0.14     
##  [9] coda_0.19-4        httr_1.4.2         highr_0.9          pillar_1.6.0      
## [13] rlang_0.4.11       rstudioapi_0.13    jquerylib_0.1.4    phangorn_2.7.0    
## [17] Matrix_1.3-2       rmarkdown_2.8      labeling_0.4.2     webshot_0.5.2     
## [21] stringr_1.4.0      igraph_1.2.6       munsell_0.5.0      compiler_4.0.5    
## [25] xfun_0.22          pkgconfig_2.0.3    systemfonts_1.0.2  htmltools_0.5.1.1 
## [29] tidyselect_1.1.1   tibble_3.1.1       gridExtra_2.3      quadprog_1.5-8    
## [33] codetools_0.2-18   reshape_0.8.8      fansi_0.4.2        crayon_1.4.1      
## [37] dplyr_1.0.6        withr_2.4.2        MASS_7.3-53.1      grid_4.0.5        
## [41] nlme_3.1-152       jsonlite_1.7.2     GGally_2.1.1       gtable_0.3.0      
## [45] lifecycle_1.0.0    magrittr_2.0.1     scales_1.1.1       stringi_1.6.1     
## [49] farver_2.1.0       reshape2_1.4.4     xml2_1.3.2         bslib_0.2.4       
## [53] ellipsis_0.3.2     ggdendro_0.1.22    generics_0.1.0     vctrs_0.3.8       
## [57] fastmatch_1.1-0    RColorBrewer_1.1-2 tools_4.0.5        glue_1.4.2        
## [61] purrr_0.3.4        parallel_4.0.5     yaml_2.2.1         colorspace_2.0-1  
## [65] rvest_1.0.0        sass_0.3.1